{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pyarrow.parquet as pq\n",
    "import pyarrow as pa\n",
    "import ujson"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1. 处理Wiki数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "origin_wiki_file = './data/wiki.simple.txt'\n",
    "\n",
    "liness = []\n",
    "with open(origin_wiki_file, 'r', encoding='utf-8') as f:\n",
    "    lines = f.readlines()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['数学：\\n',\n",
       " '数学，是研究数量、结构以及空间等概念及其变化的一门学科，属于形式科学的一种。数学利用抽象化和逻辑推理，从计数、计算、量度、对物体形状及运动的观察发展而成。数学家们拓展这些概念，以公式化新的猜想，以及从选定的公理及定义出发，严谨地推导出一些定理。\\n',\n",
       " '基础数学的知识与运用是生活中不可或缺的一环。对数学基本概念的完善，早在古埃及、美索不达米亚及古印度历史上的古代数学文本便可观见，而在古希腊那里有更为严谨的处理。从那时开始，数学的发展便持续不断地小幅进展，至16世纪的文艺复兴时期，因为新的科学发现和数学革新两者的交互，致使数学的加速发展，直至今日。数学并成为许多国家及地区的教育中的一部分。\\n',\n",
       " '数学在许多领域都有应用，包括科学、工程、医学、经济学和金融学等。数学对这些领域的应用通常被称为应用数学，有时亦会激起新的数学发现，并导致全新学科的发展，例如物理学的实质性发展中建立的某些理论激发数学家对于某些问题的不同角度的思考。数学家也研究纯粹数学，就是数学本身的实质性内容，而不以任何实际应用为目标。许多研究虽然以纯粹数学开始，但其过程中也发现许多可用之处。\\n',\n",
       " '词源。\\n']"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lines[0:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 删除空行\n",
    "new_lines = []\n",
    "for line in lines:\n",
    "    if line.strip() != '':\n",
    "        new_lines.append(line)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10103707 9804337\n"
     ]
    }
   ],
   "source": [
    "print(len(lines), len(new_lines))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# with open('./data/wiki.simple_new.txt', 'w', encoding='utf-8') as f:\n",
    "#     f.writelines(new_lines)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tb = pa.Table.from_arrays([new_lines], names=['text'])\n",
    "# compression='GZIP'\n",
    "pq.write_table(table=tb, where='./data/wiki.simple.parquet', row_group_size=50000, data_page_size=50000, )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2. 处理训练数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "my_data = []\n",
    "max_len = 384"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('./data/bell_open_source/train_3.5M_CN.json', 'r', encoding='utf-8') as f:\n",
    "    for line in f:\n",
    "        item = ujson.loads(line)\n",
    "\n",
    "        if len(item['conversations']) != 2: continue\n",
    "\n",
    "        conversation = item['conversations']\n",
    "        txt = ''\n",
    "        if conversation[0]['from'] =='human':\n",
    "            txt = f\"{conversation[0]['value']}\\n{conversation[1]['value']}\"\n",
    "        else:\n",
    "            txt = f\"{conversation[1]['value']}\\n{conversation[0]['value']}\"\n",
    "\n",
    "        if len(txt) >= max_len: continue\n",
    "        my_data.append(txt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "for file in ['./data/bell_open_source/train_2M_CN.json', './data/bell_open_source/Belle_open_source_1M.json']:\n",
    "    with open(file, 'r', encoding='utf-8') as f:\n",
    "        for line in f:\n",
    "            item = ujson.loads(line)\n",
    "\n",
    "            if item['input'].strip() != '':\n",
    "                txt = f\"{item['instruction']}\\n{item['input']}\\n{item['output']}\"\n",
    "            else:\n",
    "                txt = f\"{item['instruction']}\\n{item['output']}\"\n",
    "            \n",
    "            if len(txt) == 0 or len(txt) >= max_len: continue\n",
    "            my_data.append(\n",
    "                    txt\n",
    "            )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3033566 ['请输入一段描述有关海洋保护的情景对话。\\nPerson A: 哇，这个海滩真美啊！\\nPerson B: 是啊，可惜现在海洋污染越来越严重了。\\nPerson A: 对啊，以前来海边能看到很多海星、海螺等生物，现在好难看到了。\\nPerson B: 这些可爱的生物其实都是海洋生态系统的重要组成部分，如果它们消失了，整个生态系统都会受影响。\\nPerson A: 我们该怎么做才能保护海洋呢？\\nPerson B: 首先，我们要遵守海滩的相关规定，不要随便乱扔垃圾。其次，减少使用一次性塑料制品，比如塑料袋、吸管等，这些东西会进入海洋，危害海洋生物。最后，我们可以积极参与各种海洋保护组织，呼吁更多人加入到保护海洋的行动中来。\\nPerson A: 听起来保护海洋是一项重要的任务，我们一定要做好自己的份内工作。\\nPerson B: 是的，每个人都能为保护海洋贡献自己的一份力量。', '为我寻找5个值得信任的汽车保险公司\\n1. State Farm\\n2. Geico\\n3. Allstate\\n4. Progressive\\n5. Farmers Insurance', 'A公司去年亏损了500万美元，今年净利润增长了50%，今年的净利润是多少？\\n今年净利润为750万美元', '介绍最适合初学者的编程语言\\nPython是最适合初学者学习的编程语言之一。Python具有以下优点：\\n1. 简单易学：Python拥有简单易懂的语法和基本代码结构，可以使初学者更快地掌握编程基础。\\n2. 应用广泛：Python可以用于很多不同的任务和领域，例如数据科学、人工智能、机器学习、网络编程等。\\n3. 强大的社区支持：Python具有庞大的开发者社区，可以获得丰富的资源和支持。\\n4. 多资源学习：Python的学习资源丰富，包括图书、视频、课程等。\\n总之，对于初学者来说，Python是一门非常好的编程语言，它容易入门，应用广泛，有强大的社区支持和丰富的学习资源。', '以下是一道小学数学题： \\n有一条直线，上面有 6 只蚂蚁，其中 4 只向左走，2 只向右走。问：向右走的蚂蚁数量占比是多少？\\nA. 25%\\nB. 33.3%\\nC. 50%\\nD. 66.6%\\n回答该问题。\\n向右走的蚂蚁数量占比为两只蚂蚁中向右走的蚂蚁数量与总蚂蚁数量的比值。\\n先算出总蚂蚁数量：6 只蚂蚁。\\n再算出向右走的蚂蚁数量：2 只蚂蚁。\\n将向右走的蚂蚁数量除以总蚂蚁数量，即得到向右走的蚂蚁数量占比。\\n计算式为：\\n2 ÷ 6 = 0.33\\n所以，向右走的蚂蚁数量占比为 33.3%，选 B。']\n"
     ]
    }
   ],
   "source": [
    "print(len(my_data), my_data[0:5])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "tb = pa.Table.from_arrays([my_data], names=['text'])\n",
    "# compression='GZIP'\n",
    "pq.write_table(table=tb, where='./data/bell_pretrain_3M.parquet', row_group_size=20480, data_page_size=20480, )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3. 处理sft数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "lines = []\n",
    "with open('./data/sft_0.8M_CN.json', 'r', encoding='utf-8') as f:\n",
    "    for line in f:\n",
    "        item = ujson.loads(line)\n",
    "\n",
    "        txt = f\"{item['instruction']}{item['output']}\"\n",
    "        \n",
    "        if len(txt) == 0 or len(txt) >= 320: continue\n",
    "        lines.append(\n",
    "                item\n",
    "        )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "726475\n"
     ]
    }
   ],
   "source": [
    "print(len(lines))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "tb = pa.Table.from_pylist(lines)\n",
    "# compression='GZIP'\n",
    "pq.write_table(table=tb, where='./data/sft_train_data.parquet', row_group_size=20480, data_page_size=20480, )"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "py310",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
